Import and check data
# Load the radiomics feature table and the clinical outcome table.
features = read_csv('./feature_extracted_total177.csv')
# Carry the patient name over from the feature table so the two tables align.
# NOTE(review): assumes both CSVs share the same row order — TODO confirm.
clinical = read_csv('./outcome_2020_total(1).csv')
clinical = mutate(clinical, Name = features$name)
#skimr::skim(clinical)
#skimr::skim(features) 0 missing values
Pre-filter the features by pairwise correlation
#var_0 = features %>%
# filter(pcr == 0)
#var_1 = features %>%
# filter(pcr != 0)
#var = colnames(features)[-c(1:2)] # may change index
#
#sig_vec = vector()
#for (variable in var) {
# #print(variable)
# test.sig= t.test(pull(var_0,variable),pull(var_1,variable))$p.value
# sig_vec = append(sig_vec, test.sig)
#}
#
#sig.total = tibble(
# var = var,
# sig = sig_vec
#) %>%
# filter(sig <0.1)
#
#features_into_account = sig.total$var
#length(features_into_account) #94
#t = scale(features[,3:1220])
# Filter the features based on pairwise correlation.
# NOTE(review): mlbench/ranger are loaded here but not used in this chunk.
library(mlbench)
library(ranger)
# Correlation matrix of the standardized radiomic features
# (columns 3:1220 of `features`; columns 1:2 are assumed to be id/outcome).
correlationMatrix <- cor(scale(features[,3:1220]))
# summarize the correlation matrix
#print(correlationMatrix)
# Indices of features flagged as highly correlated (caret::findCorrelation,
# loaded elsewhere). NOTE(review): the returned indices refer to columns of
# features[,3:1220], i.e. they are offset by 2 relative to the columns of
# `features` itself — downstream code doing features[,-highlyCorrelated]
# must account for this.
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff = 0.55)
# print indexes of highly correlated attributes
#print(highlyCorrelated)
length(highlyCorrelated)
## [1] 1164
Correlation matrix of the retained features
library(ggcorrplot)
# correlation for all the radiomic features
#pheatmap::pheatmap(cor.all,cluster_rows = F,cluster_cols = F,show_rownames = F,show_colnames = #F,main = "Correlation Matrix for All Extracted Radiomic Features") # drawn ad hoc
#heatmap(correlationMatrix,labRow = F,labCol = F,main = "Correlation Matrix for All Extracted #Radiomic Features", xlab = "The 1 ~ 1219 Originally Extracted Features",ylab = "The 1 ~ 1219 #Originally Extracted Features",dendrogram='none')
# Correlation matrix of the retained (low-correlation) features.
# BUG FIX: `highlyCorrelated` indexes the columns of features[,3:1220], so its
# values are offset by 2 relative to `features`; drop the two leading
# id/outcome columns first, then apply the indices within the sub-matrix.
cor.lowcorr = cor(features[,-c(1,2)][,-highlyCorrelated])
# Label both axes with sequential feature ids (seq_len instead of seq(1:n)).
colnames(cor.lowcorr) = as.character(seq_len(ncol(cor.lowcorr)))
rownames(cor.lowcorr) = as.character(seq_len(ncol(cor.lowcorr)))
ggcorrplot(cor.lowcorr,
           #type = "lower",
           outline.color = "white",
           colors = c("#6D9EC1", "white", "#E46726")) +
  labs(x = "Features ID",
       title = "Correlation Matrix for the 47 Selected Radiomics Features at the First Stage",
       y = "Features ID") +
  theme(plot.title = element_text(hjust=.5))# selected features

LASSO
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
# Design matrix for the LASSO: the low-correlation radiomic features only.
# BUG FIX: `highlyCorrelated` indexes the columns of features[,3:1220]; drop
# the two leading id/outcome columns before applying those indices.
features.lasso = features[,-c(1,2)] %>%
  .[,-highlyCorrelated] %>%
  mutate(pcr = features$pcr) %>%
  select(pcr,everything())
# Standardized predictor matrix and binary outcome factor (N / Y).
x = features.lasso %>% .[,-1] %>% as.matrix() %>% scale()
y = features$pcr %>% as.factor()
levels(y) = c("N","Y")
ctrl1 = trainControl(method = "cv", number = 10,
summaryFunction = prSummary,
# prSummary needs calculated class probs
classProbs = T)
set.seed(888)
# NOTE(review): prSummary produces AUC/Precision/Recall/F, not "ROC"; caret
# will warn and fall back to the first available metric. Consider
# metric = "AUC" here to make the tuning criterion explicit.
lasso.fit = train(x, y,method = "glmnet",
tuneGrid = expand.grid(alpha = 1,lambda =exp(seq(-10, -5, length=1000))),
#preProc = c("center", "scale"),
metric = "ROC",
trControl = ctrl1)
lasso.fit$bestTune
## alpha lambda
## 151 1 9.618384e-05
#lasso.fit$finalModel$beta
# Tuning profile: CV performance across the lambda grid, best value highlighted.
ggplot(lasso.fit, highlight = TRUE) +
  labs(title = "Variable Selection Process via Logistic LASSO Regression") +
  theme_bw() +
  theme(plot.title = element_text(hjust = .5))

#trace plots for lasso variable selection
# Trace plot: coefficient paths versus log(lambda) for the fitted glmnet model.
library(glmnet)
# FIX: call the S3 generic `plot()` rather than the method `plot.glmnet()`
# directly; dispatch selects plot.glmnet for a "glmnet" object anyway.
plot(lasso.fit$finalModel, xvar = "lambda", label = T,
     main = "Trace Plot for the Coefficients")

lasso.fit$bestTune
## alpha lambda
## 151 1 9.618384e-05
calculate radscores
# Extract the LASSO coefficients at the tuned lambda and keep the non-zero ones.
coef = as.matrix(coef(lasso.fit$finalModel, lasso.fit$bestTune$lambda)) %>%
  data.frame()
coef = coef %>%
  mutate(coef = rownames(coef)) %>%
  rename("value" = "X1") %>%
  filter(value != 0)
# Number of retained terms (intercept included); already filtered above.
nrow(coef) # number of useful predictors
## [1] 55
# Names of the selected predictors (drop row 1, the intercept).
predictors = coef$coef[-1]
# Their coefficient values, aligned with `predictors`.
predictors_val = coef$value[-1]
# Radscore = standardized selected-feature matrix %*% coefficients + intercept.
# NOTE(review): the features are re-scaled here independently of the scaling
# used when fitting the LASSO — TODO confirm the two scalings agree.
feature.matrix = features[predictors] %>% as.matrix() %>% scale()
coef.matrix = predictors_val %>% as.matrix()
radscore = feature.matrix %*% coef.matrix + coef$value[1]
# Build the modeling dataset: clinical variables plus the radscore; also
# persisted to disk for reuse.
data.pred = clinical %>%
mutate(radscore = as.vector(radscore),
PCR = factor(PCR))
write.csv(data.pred,"./CTscore177.csv")
check radscore
# Sanity check: distribution of the radscore within each observed PCR group.
data.pred %>%
  mutate(id = seq_len(nrow(data.pred))) %>%
  ggplot(aes(x = PCR, y = radscore, fill = PCR)) +
  geom_boxplot() +
  theme_bw()

# Waterfall-style bar chart: one bar per patient, ordered by radscore and
# colored by observed PCR status.
data.pred %>%
arrange(PCR) %>%
mutate(id = 1:nrow(data.pred)) %>%
ggplot(aes(x = reorder(id,radscore), y = radscore,
fill = PCR)) +
geom_col() +
theme_classic() +
labs(title = "Radscores for the 177 Patients",
x = "Patients",
y = "Radscore") +
theme(plot.title = element_text(hjust = .5),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
ggsci::scale_fill_jama(labels = c("PCR = 0", "PCR = 1"))

organizing data
#rf.score = read_csv("../rf.score_simple.csv") %>% as.matrix() %>% .[,-1]
# Recode the clinical covariates as factors for modeling; collapse rare levels.
data.pred = data.pred %>%
mutate(Age = factor(Age),
cT = ifelse(cT == 2, 3, cT),
cT = factor(cT),
cN = factor(cN),
cTNM = factor(cTNM),
MRF = as.character(MRF),
`Tumor length` = factor(`Tumor length`),
Distance = factor(Distance),
CEA = factor(CEA),
Differentiation = factor(Differentiation),
Group = ifelse(Group == "C", "B", Group)
#rf.score = rf.score
) # only 3 patients have cT == 2, which would likely break cross-validation; recode them as 3 for now
# 2020-05-10: rf.score was added (currently commented out)
str(data.pred$PCR)
## Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 1 1 1 ...
Regression models (logistic / LASSO)
set.seed(888)
# Repeated 10-fold CV (5 repeats); class probabilities enable ROC-based tuning.
ctrl = trainControl(method = "repeatedcv", number = 10, repeats = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# Valid R names for the outcome levels (required when classProbs = TRUE).
levels(data.pred$PCR) = c("N", "Y")
# Stratified 2:1 split into training / validation rows.
rowTrain = createDataPartition(y = data.pred$PCR,
p = 2/3, # split into training / validation
list = F)
# Columns 2:13 are the clinical covariates and 15 is the radscore;
# PCR is assumed to be column 14 — TODO confirm against the CSV layout.
rad.fit = train(x = data.pred[rowTrain,c(2:13,15)], # radscore and others
y = data.pred$PCR[rowTrain],
method = "glm",
metric = "ROC",
trControl = ctrl
) #radscore and others (logistic
# NOTE(review): model.matrix() keeps the "(Intercept)" column, which is then
# centered/scaled to a constant — consider dropping column 1. Also, unlike the
# other fits, no metric = "ROC" is specified for this model.
x.lasso = model.matrix(PCR ~. ,data.pred[rowTrain,c(2:15)])
y.lasso = data.pred[rowTrain,c(2:15)]$PCR
lasso.predict.fit = train(x = x.lasso, # radscore with others (lasso)
y = y.lasso,
method = "glmnet",
tuneGrid = expand.grid(alpha = 1,lambda = exp(seq(-5, 5, length=1000))),
preProc = c("center", "scale"),
trControl = ctrl)
rad_and_group.fit = train(x = data.pred[rowTrain,c(4,15)], # only radscore and group(logistic)
y = data.pred$PCR[rowTrain],
method = "glm",
metric = "ROC",
trControl = ctrl
)
all_other.fit = train(x = data.pred[rowTrain,c(2:13)], # only others
y = data.pred$PCR[rowTrain],
method = "glm",
metric = "ROC",
trControl = ctrl
)
# Hold-out performance of rad.fit at a 0.5 probability cut-off; column 2 of
# the "prob" predictions is assumed to be class "Y" (second factor level).
test.pred.prob = predict(rad.fit, newdata = data.pred[-rowTrain,] , type = "prob")[,2]
test.pred = rep("N", length(test.pred.prob ))
test.pred[test.pred.prob > 0.5] = "Y"
caret::confusionMatrix(data = factor(test.pred,levels = c("N","Y")),
reference = data.pred$PCR[-rowTrain],
positive = "Y")
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 43 5
## Y 5 5
##
## Accuracy : 0.8276
## 95% CI : (0.7057, 0.9141)
## No Information Rate : 0.8276
## P-Value [Acc > NIR] : 0.5834
##
## Kappa : 0.3958
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.50000
## Specificity : 0.89583
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.89583
## Prevalence : 0.17241
## Detection Rate : 0.08621
## Detection Prevalence : 0.17241
## Balanced Accuracy : 0.69792
##
## 'Positive' Class : Y
##
random forest
# Random forest tuning grid for caret's "ranger" method.
rf.grid = expand.grid(mtry = 1:6,
splitrule = "gini",
min.node.size = 20:30
)
set.seed(888)
# Fit RF on the training rows (column 1 dropped — presumably an id; confirm).
rf.fit = train(PCR ~ ., data.pred[rowTrain,-1],
#subset = rowTrain,
method = "ranger",
tuneGrid = rf.grid,
metric = "ROC",
trControl = ctrl)
#rf.fit = train(PCR ~ ., data.pred[rowTrain,-1],
# #subset = rowTrain,
# method = "rf",
# ntree = 1000,
# tuneGrid = data.frame(mtry = 1:6),
# metric = "ROC",
# trControl = ctrl)
rf.fit$bestTune
## mtry splitrule min.node.size
## 56 6 gini 20
# Tuning surface with the best parameter combination highlighted.
ggplot(rf.fit, highlight = TRUE) +
theme_minimal() +
labs(title = "Tuning Parameters for Random Forest") +
theme(plot.title = element_text(hjust = .5)) +
ggsci::scale_color_lancet()
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

set.seed(888)
# Refit the final random forest with ranger directly to extract impurity-based
# variable importance. make.names() sanitizes non-syntactic column names
# (e.g. `Tumor length`) so the formula interface works.
data.pred.rf = data.pred
colnames(data.pred.rf) = make.names(colnames(data.pred.rf))
# NOTE(review): min.node.size = 29 does not match rf.fit$bestTune
# (min.node.size = 20 per the output above) — TODO confirm which is intended.
rf.final = ranger(PCR ~ .,
data.pred.rf[rowTrain,-1],
mtry = 6,
splitrule = "gini",
min.node.size = 29,
importance = "impurity")
# Variable importance plot.
vip::vip(rf.final) + theme_bw() + ggtitle("Variable Importance - RF")

# Horizontal bar chart of the same importance scores, smallest at the bottom.
barplot(sort(ranger::importance(rf.final), decreasing = FALSE),
las = 2,
horiz = TRUE,
cex.names = 0.6,
col = colorRampPalette(colors = c("black","gold"))(20))

# Explain individual RF predictions with LIME.
# fixed a small bug
library(lime)
##
## Attaching package: 'lime'
## The following object is masked from 'package:dplyr':
##
## explain
# Two held-out cases to explain (column 1 dropped — presumably an id; confirm).
new_obs = data.pred[-rowTrain,-c(1)][1:2,]
explainer.rf = lime(data.pred[rowTrain,-c(1)], rf.fit)
# lime::explain is namespaced because lime masks dplyr::explain.
explanation.rf = lime::explain(new_obs, explainer.rf, n_features = 8, labels = "Y")
plot_features(explanation.rf)

plot_explanations(explanation.rf)

#attention!!! => actual result: case 1 is N, case 2 is Y
boosting
# Gradient boosting tuning grid.
gbmB.grid = expand.grid(n.trees = c(3500,4000,4500),
interaction.depth = 1:4,
shrinkage = c(0.0008,0.001,0.0015),
n.minobsinnode = 10:20) # 2 originally
set.seed(888)
# Binomial loss function; column 1 dropped (presumably an id; confirm).
gbmB.fit = train(PCR ~ ., data.pred[rowTrain,-1],
tuneGrid = gbmB.grid,
trControl = ctrl,
method = "gbm",
distribution = "bernoulli",
metric = "ROC",
verbose = FALSE)
# Tuning profile with the best combination highlighted.
ggplot(gbmB.fit, highlight = T) +
theme_minimal() +
labs(title = "Tuning Parameters for Gradient Boosting Model") +
theme(plot.title = element_text(hjust = .5)) #+

#ggsci::scale_color_lancet()
gbmB.fit$bestTune
## n.trees interaction.depth shrinkage n.minobsinnode
## 2 4000 1 8e-04 10
# Relative influence of each predictor in the boosted model.
vip::vip(gbmB.fit$finalModel) + theme_bw() + ggtitle("Variable Importance - GBM")

summary(gbmB.fit$finalModel, las = 2, cBars = 19, cex.names = 0.6)

## var rel.inf
## radscore radscore 89.91369181
## `Tumor thickness` `Tumor thickness` 5.95875618
## GroupB GroupB 2.96606778
## Age1 Age1 0.28694248
## CEA1 CEA1 0.24241520
## Differentiation1 Differentiation1 0.18522160
## `Tumor length`1 `Tumor length`1 0.12323893
## Distance1 Distance1 0.12047243
## MRF1 MRF1 0.08252402
## SexMale SexMale 0.04490061
## cT4 cT4 0.04110177
## cN1 cN1 0.02046010
## cTNM3 cTNM3 0.01420709
# LIME explanations for the GBM on the same two held-out cases (new_obs).
explainer.gbm = lime(data.pred[rowTrain,-1], gbmB.fit)
explanation.gbm = explain(new_obs, explainer.gbm, n_features = 8,labels = "Y")
plot_features(explanation.gbm)

Decision Tree
#data.pred.tree = data.pred
#colnames(data.pred.tree) = make.names(colnames(data.pred.tree))
#library(rpart.plot)
#tree.fit = train(PCR ~ ., data.pred.tree[rowTrain,-1],
# method = "rpart",
# metric = "ROC",
# trControl = ctrl,
# tuneGrid = data.frame(cp = exp(seq(-9,-1, len = 100))))
#
#ggplot(tree.fit,highlight = T)
#rpart.plot(tree.fit$finalModel)
resample
# Compare the CV resampling distributions (ROC) across all fitted models.
model_list = list(only_radscore = rad.fit,
                  rad_and_group = rad_and_group.fit,
                  all_other = all_other.fit,
                  rf = rf.fit,
                  gbmB = gbmB.fit)
res = resamples(model_list, metric = "ROC")
summary(res)
##
## Call:
## summary.resamples(object = res)
##
## Models: only_radscore, rad_and_group, all_other, rf, gbmB
## Number of resamples: 50
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## only_radscore 0.30 0.8888889 0.9500000 0.9093333 1.00 1 0
## rad_and_group 0.55 0.9111111 1.0000000 0.9322222 1.00 1 0
## all_other 0.00 0.4000000 0.5250000 0.5326667 0.65 1 0
## rf 0.55 0.9000000 0.9833333 0.9151111 1.00 1 0
## gbmB 0.55 0.8916667 0.9833333 0.9330370 1.00 1 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## only_radscore 0.7000000 0.9 1 0.9564444 1 1 0
## rad_and_group 0.7777778 1.0 1 0.9708889 1 1 0
## all_other 0.7777778 0.9 1 0.9551111 1 1 0
## rf 0.8000000 1.0 1 0.9815556 1 1 0
## gbmB 0.7000000 0.9 1 0.9326667 1 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## only_radscore 0 0.5 0.5000000 0.5733333 1 1.0000000 0
## rad_and_group 0 0.5 1.0000000 0.7200000 1 1.0000000 0
## all_other 0 0.0 0.0000000 0.0600000 0 0.6666667 0
## rf 0 0.5 0.5000000 0.6166667 1 1.0000000 0
## gbmB 0 0.5 0.5833333 0.6933333 1 1.0000000 0
# Box-and-whisker plot of the resampled ROC/Sens/Spec distributions.
bwplot(res)

radscores by groups
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:glmnet':
##
## auc
## The following object is masked from 'package:ModelMetrics':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# ROC of the radscore alone against PCR over the full cohort.
# FIX: name the result `roc.radscore` instead of `roc`, which shadowed
# the pROC::roc() function.
roc.radscore = roc(data.pred$PCR,data.pred$radscore)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.radscore,
auc.polygon = TRUE,
grid=c(0.1, 0.2),
grid.col = c("green", "red"),
max.auc.polygon = TRUE,
auc.polygon.col = "skyblue",
print.thres=TRUE)

# Dichotomize the radscore at the best threshold printed on the plot.
# NOTE(review): -1.270 is hard-coded from the plot output — recompute with
# coords() if the data change.
data.tbl = data.pred %>%
mutate(radscore_cat = ifelse(radscore >= -1.270, "High", "Low"))
table 1 grouped by radscores
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Cohort labels: rows selected by rowTrain are the primary (training) cohort,
# the remainder the validation cohort.
# FIX: derive the cohort size from the data rather than hard-coding 177.
label = rep("Primary Cohort", nrow(data.pred))
label[-rowTrain] = "Validation Cohort"
data.tbl = data.tbl %>%
mutate(cohort = label)
primary = data.tbl[rowTrain,]
validation = data.tbl[-rowTrain,]
# Table-1 style descriptive statistics grouped by PCR, one table per cohort.
control = arsenal::tableby.control(test = T,
numeric.stats = c("meansd","medianq1q3","range"),
#cat.stats = c("countrowpct"),
ordered.stats = c("Nmiss"),
digits = 2)
tbl_primary = arsenal::tableby(PCR~ Age + Sex + Group + cT + cN +cTNM + MRF + `Tumor length` + `Tumor thickness` + Distance + CEA + Differentiation + radscore, data = primary,control = control)
tbl_validation = arsenal::tableby(PCR~ Age + Sex + Group + cT + cN +cTNM + MRF + `Tumor length` + `Tumor thickness` + Distance + CEA + Differentiation + radscore, data = validation, control = control)
summary(tbl_primary,text = T) %>%
knitr::kable(booktabs = T, caption = "Descriptive Statistics") %>%
kable_styling(latex_options = c("striped", "hold_position"))
Descriptive Statistics
|
|
N (N=98)
|
Y (N=21)
|
Total (N=119)
|
p value
|
|
Age
|
|
|
|
0.783
|
|
|
67 (68.4%)
|
15 (71.4%)
|
82 (68.9%)
|
|
|
|
31 (31.6%)
|
6 (28.6%)
|
37 (31.1%)
|
|
|
Sex
|
|
|
|
0.926
|
|
|
29 (29.6%)
|
6 (28.6%)
|
35 (29.4%)
|
|
|
|
69 (70.4%)
|
15 (71.4%)
|
84 (70.6%)
|
|
|
Group
|
|
|
|
0.040
|
|
|
31 (31.6%)
|
2 (9.5%)
|
33 (27.7%)
|
|
|
|
67 (68.4%)
|
19 (90.5%)
|
86 (72.3%)
|
|
|
cT
|
|
|
|
0.492
|
|
|
85 (86.7%)
|
17 (81.0%)
|
102 (85.7%)
|
|
|
|
13 (13.3%)
|
4 (19.0%)
|
17 (14.3%)
|
|
|
cN
|
|
|
|
0.594
|
|
|
24 (24.5%)
|
4 (19.0%)
|
28 (23.5%)
|
|
|
|
74 (75.5%)
|
17 (81.0%)
|
91 (76.5%)
|
|
|
cTNM
|
|
|
|
0.531
|
|
|
25 (25.5%)
|
4 (19.0%)
|
29 (24.4%)
|
|
|
|
73 (74.5%)
|
17 (81.0%)
|
90 (75.6%)
|
|
|
MRF
|
|
|
|
0.735
|
|
|
69 (70.4%)
|
14 (66.7%)
|
83 (69.7%)
|
|
|
|
29 (29.6%)
|
7 (33.3%)
|
36 (30.3%)
|
|
|
Tumor length
|
|
|
|
0.478
|
|
|
21 (21.4%)
|
6 (28.6%)
|
27 (22.7%)
|
|
|
|
77 (78.6%)
|
15 (71.4%)
|
92 (77.3%)
|
|
|
Tumor thickness
|
|
|
|
0.054
|
|
|
15.70 (6.53)
|
19.00 (9.20)
|
16.28 (7.14)
|
|
|
|
15.00 (11.25, 18.00)
|
15.00 (13.00, 22.00)
|
15.00 (12.00, 19.00)
|
|
|
|
8.00 - 51.00
|
12.00 - 47.00
|
8.00 - 51.00
|
|
|
Distance
|
|
|
|
0.401
|
|
|
37 (37.8%)
|
10 (47.6%)
|
47 (39.5%)
|
|
|
|
61 (62.2%)
|
11 (52.4%)
|
72 (60.5%)
|
|
|
CEA
|
|
|
|
0.567
|
|
|
67 (68.4%)
|
13 (61.9%)
|
80 (67.2%)
|
|
|
|
31 (31.6%)
|
8 (38.1%)
|
39 (32.8%)
|
|
|
Differentiation
|
|
|
|
0.226
|
|
|
42 (42.9%)
|
6 (28.6%)
|
48 (40.3%)
|
|
|
|
56 (57.1%)
|
15 (71.4%)
|
71 (59.7%)
|
|
|
radscore
|
|
|
|
< 0.001
|
|
|
-4.33 (4.34)
|
0.51 (2.37)
|
-3.47 (4.45)
|
|
|
|
-3.39 (-5.35, -2.23)
|
0.67 (-0.71, 1.82)
|
-2.94 (-5.03, -1.34)
|
|
|
|
-37.69 - 0.07
|
-6.15 - 5.38
|
-37.69 - 5.38
|
|
# Render the validation-cohort descriptive table.
summary(tbl_validation,text = T) %>%
knitr::kable(booktabs = T, caption = "Descriptive Statistics") %>%
kable_styling(latex_options = c("striped", "hold_position"))
Descriptive Statistics
|
|
N (N=48)
|
Y (N=10)
|
Total (N=58)
|
p value
|
|
Age
|
|
|
|
0.318
|
|
|
32 (66.7%)
|
5 (50.0%)
|
37 (63.8%)
|
|
|
|
16 (33.3%)
|
5 (50.0%)
|
21 (36.2%)
|
|
|
Sex
|
|
|
|
0.882
|
|
|
18 (37.5%)
|
4 (40.0%)
|
22 (37.9%)
|
|
|
|
30 (62.5%)
|
6 (60.0%)
|
36 (62.1%)
|
|
|
Group
|
|
|
|
0.414
|
|
|
13 (27.1%)
|
4 (40.0%)
|
17 (29.3%)
|
|
|
|
35 (72.9%)
|
6 (60.0%)
|
41 (70.7%)
|
|
|
cT
|
|
|
|
0.841
|
|
|
37 (77.1%)
|
8 (80.0%)
|
45 (77.6%)
|
|
|
|
11 (22.9%)
|
2 (20.0%)
|
13 (22.4%)
|
|
|
cN
|
|
|
|
0.113
|
|
|
10 (20.8%)
|
0 (0.0%)
|
10 (17.2%)
|
|
|
|
38 (79.2%)
|
10 (100.0%)
|
48 (82.8%)
|
|
|
cTNM
|
|
|
|
0.359
|
|
|
11 (22.9%)
|
1 (10.0%)
|
12 (20.7%)
|
|
|
|
37 (77.1%)
|
9 (90.0%)
|
46 (79.3%)
|
|
|
MRF
|
|
|
|
0.743
|
|
|
36 (75.0%)
|
7 (70.0%)
|
43 (74.1%)
|
|
|
|
12 (25.0%)
|
3 (30.0%)
|
15 (25.9%)
|
|
|
Tumor length
|
|
|
|
0.414
|
|
|
13 (27.1%)
|
4 (40.0%)
|
17 (29.3%)
|
|
|
|
35 (72.9%)
|
6 (60.0%)
|
41 (70.7%)
|
|
|
Tumor thickness
|
|
|
|
0.417
|
|
|
15.69 (5.41)
|
14.20 (4.16)
|
15.43 (5.21)
|
|
|
|
15.00 (12.75, 18.00)
|
13.50 (11.25, 16.00)
|
15.00 (12.00, 17.75)
|
|
|
|
7.50 - 35.00
|
9.00 - 22.00
|
7.50 - 35.00
|
|
|
Distance
|
|
|
|
0.462
|
|
|
18 (37.5%)
|
5 (50.0%)
|
23 (39.7%)
|
|
|
|
30 (62.5%)
|
5 (50.0%)
|
35 (60.3%)
|
|
|
CEA
|
|
|
|
0.653
|
|
|
30 (62.5%)
|
7 (70.0%)
|
37 (63.8%)
|
|
|
|
18 (37.5%)
|
3 (30.0%)
|
21 (36.2%)
|
|
|
Differentiation
|
|
|
|
0.810
|
|
|
22 (45.8%)
|
5 (50.0%)
|
27 (46.6%)
|
|
|
|
26 (54.2%)
|
5 (50.0%)
|
31 (53.4%)
|
|
|
radscore
|
|
|
|
< 0.001
|
|
|
-4.22 (3.22)
|
0.38 (1.36)
|
-3.43 (3.46)
|
|
|
|
-3.62 (-5.96, -2.29)
|
0.09 (-0.58, 1.18)
|
-3.04 (-5.09, -1.17)
|
|
|
|
-16.75 - 1.59
|
-1.22 - 2.85
|
-16.75 - 2.85
|
|
Making ROC curves for logistic regression Models
for training set
# Training-set ROC for rad.fit (radscore + clinical variables).
rad.fit.pred.train.prob = predict(rad.fit,newdata = data.pred[rowTrain,],type = "prob")[,2]# select pos resp
# Hard classification at the 0.5 probability cut-off (used later for tables).
rad.fit.pred.train = rep("N",length(rad.fit.pred.train.prob))
rad.fit.pred.train[rad.fit.pred.train.prob > 0.5] = "Y"
roc.rad.fit = roc(data.pred$PCR[rowTrain], rad.fit.pred.train.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rad.fit,
legacy.axes = T,
print.auc = T,
print.thres = T)

ci.auc(roc.rad.fit)
## 95% CI: 0.8945-1 (DeLong)
#plot(smooth(rad_and_group.fit.fit),
#add = T,
#col = 4)
# Training-set ROC for rad_and_group.fit (radscore + group only).
rad_group.fit.pred.train.prob = predict(rad_and_group.fit ,newdata = data.pred[rowTrain,],type = "prob")[,2]# select pos resp
rad_group.fit.pred.train = rep("N",length(rad_group.fit.pred.train.prob))
rad_group.fit.pred.train[rad_group.fit.pred.train.prob > 0.5] = "Y"
roc.rad_group.fit = roc(data.pred$PCR[rowTrain], rad_group.fit.pred.train.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rad_group.fit,
legacy.axes = T,
print.auc = T,
print.thres = T)

ci.auc(roc.rad_group.fit)
## 95% CI: 0.8565-1 (DeLong)
# Training-set ROC for the random forest.
rf.fit.pred.train.prob = predict(rf.fit,newdata = data.pred[rowTrain,-1],type = "prob")[,2]# select pos resp
rf.fit.pred.train = rep("N",length(rf.fit.pred.train.prob))
rf.fit.pred.train[rf.fit.pred.train.prob > 0.5] = "Y"
roc.rf.fit = roc(data.pred$PCR[rowTrain], rf.fit.pred.train.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rf.fit,
legacy.axes = T,
print.auc = T,
print.thres = T)

ci.auc(roc.rf.fit)
## 95% CI: 0.9772-1 (DeLong)
# Training-set ROC for the gradient boosting model.
gbmB.fit.pred.train.prob = predict(gbmB.fit, newdata = data.pred[rowTrain,-1], type = "prob")[,2]# select pos resp
gbmB.fit.pred.train = rep("N",length(gbmB.fit.pred.train.prob))
gbmB.fit.pred.train[gbmB.fit.pred.train.prob > 0.5] = "Y"
roc.gbmB.fit = roc(data.pred$PCR[rowTrain], gbmB.fit.pred.train.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.gbmB.fit,
legacy.axes = T,
print.auc = T)

ci.auc(roc.gbmB.fit)
## 95% CI: 0.921-0.9973 (DeLong)
# Overlay the training-set ROC curves of all models on one plot.
plot(roc.rad.fit, print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.1,auc.polygon.col="gray", grid = c(0.5, 0.3), max.auc.polygon=TRUE,legen = T,main = "Primary Cohort")
plot.roc(roc.rf.fit,add=T,col="red", print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.2)
# FIX: removed a stray trailing comma that passed an empty (missing) argument.
plot.roc(roc.gbmB.fit,add=T,col="blue",print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.3)
plot.roc(roc.rad_group.fit,add=T,col="yellow",print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.4)
# FIX: the legend listed a fifth entry ("SVM") with no matching curve or color
# (col would have been recycled, mislabeling it black); keep the four entries
# actually plotted, in the order matching `col`.
legend("bottomright",legend = c("radscore with other clinical variables",
"radscore with `group`",
"random forest",
"gradient boosting model"),
col = c("black","yellow","red","blue"),lwd = 4,cex = 0.35)

for validation set
# Validation-set ROC for rad.fit (radscore + clinical variables).
rad.fit.pred.test.prob = predict(rad.fit,newdata = data.pred[-rowTrain,],type = "prob")[,2]# select pos resp
rad.fit.pred.test = rep("N",length(rad.fit.pred.test.prob))
rad.fit.pred.test[rad.fit.pred.test.prob > 0.5] = "Y"
roc.rad.fit.test = roc(data.pred$PCR[-rowTrain], rad.fit.pred.test.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rad.fit.test,
legacy.axes = T,
print.auc = T,
print.thres = T)

ci.auc(roc.rad.fit.test)
## 95% CI: 0.6934-1 (DeLong)
#plot(smooth(rad_and_group.fit.fit),
#add = T,
#col = 4)
# Validation-set ROC for rad_and_group.fit (radscore + group only).
rad_group.fit.pred.test.prob = predict(rad_and_group.fit ,newdata = data.pred[-rowTrain,],type = "prob")[,2]# select pos resp
rad_group.fit.pred.test = rep("N",length(rad_group.fit.pred.test.prob))
rad_group.fit.pred.test[rad_group.fit.pred.test.prob > 0.5] = "Y"
roc.rad_group.fit.test = roc(data.pred$PCR[-rowTrain], rad_group.fit.pred.test.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rad_group.fit.test,
legacy.axes = T,
print.auc = T,
print.thres = T)

ci.auc(roc.rad_group.fit.test)
## 95% CI: 0.9051-1 (DeLong)
# Validation-set ROC for the random forest (comment previously said "train").
rf.fit.pred.test.prob = predict(rf.fit,newdata = data.pred[-rowTrain,-1],type = "prob")[,2]# select pos resp
rf.fit.pred.test = rep("N",length(rf.fit.pred.test.prob))
rf.fit.pred.test[rf.fit.pred.test.prob > 0.5] = "Y"
roc.rf.fit.test = roc(data.pred$PCR[-rowTrain], rf.fit.pred.test.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.rf.fit.test,
legacy.axes = T,
print.auc = T,
print.thres = T)

ci.auc(roc.rf.fit.test)
## 95% CI: 0.9007-1 (DeLong)
# Validation-set ROC for the gradient boosting model.
gbmB.fit.pred.test.prob = predict(gbmB.fit,newdata = data.pred[-rowTrain,-1], type = "prob")[,2]# select pos resp
gbmB.fit.pred.test = rep("N",length(gbmB.fit.pred.test.prob))
gbmB.fit.pred.test[gbmB.fit.pred.test.prob > 0.5] = "Y"
roc.gbmB.fit.test = roc(data.pred$PCR[-rowTrain], gbmB.fit.pred.test.prob)
## Setting levels: control = N, case = Y
## Setting direction: controls < cases
plot(roc.gbmB.fit.test,
legacy.axes = T,
print.auc = T)

ci.auc(roc.gbmB.fit.test)
## 95% CI: 0.9065-1 (DeLong)
# Overlay the validation-set ROC curves of all models on one plot.
plot(roc.rad.fit.test, print.auc=TRUE,print.auc.x = 0.3,print.auc.y= 0.1,grid = c(0.5, 0.3), max.auc.polygon=TRUE,legen = T,main = "Validation Cohort")
plot.roc(roc.rf.fit.test,add=T,col="red", print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.2)
# FIX: removed a stray trailing comma that passed an empty (missing) argument.
plot.roc(roc.gbmB.fit.test,add=T,col="blue",print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.3)
plot.roc(roc.rad_group.fit.test,add=T,col="yellow",print.auc=TRUE,print.auc.x=0.3,print.auc.y=0.4)
# FIX: dropped the unmatched fifth legend entry ("SVM"); the four entries
# correspond, in order, to the colors in `col`.
legend("bottomright",legend = c("radscore with other clinical variables",
"radscore with `group`",
"random forest",
"gradient boosting model"),
col = c("black","yellow","red","blue"),lwd = 4,cex = 0.35)

Logistic regression’s calibration & Hosmer–Lemeshow test
# Calibration curves for the primary and validation cohorts.
# The complement 1 - p(Y) is taken, i.e. the probability of the first factor
# level "N" — presumably because caret::calibration expects the probability of
# the first class level; TODO confirm.
trainProbs = data.frame(
obs = data.pred$PCR[rowTrain],
rad = 1-rad.fit.pred.train.prob,
rf = 1-rf.fit.pred.train.prob,
rad_and_group = 1-rad_group.fit.pred.train.prob,
gbm = 1-gbmB.fit.pred.train.prob)
testProbs = data.frame(
obs = data.pred$PCR[-rowTrain],
rad = 1-rad.fit.pred.test.prob,
rf = 1-rf.fit.pred.test.prob,
rad_and_group = 1-rad_group.fit.pred.test.prob,
gbm = 1-gbmB.fit.pred.test.prob
)
# Internal (training) and external (validation) calibration objects.
cal.int = calibration(obs ~ rad +
rf +
rad_and_group + gbm,
data = trainProbs)
cal.ext = calibration(obs ~ rad +
rf + rad_and_group +gbm
, data = testProbs)
plot(cal.int,type = "l",auto.key = list(columns = 4,
lines = TRUE,
points = FALSE),
main = "Calibration Curves for Primary Cohort")

plot(cal.ext,type = "l",auto.key = list(columns = 4,
lines = TRUE,
points = FALSE),
main = "Calibration Curves for Validation Cohort")

Results of the logistic regression
# Coefficients of the full logistic model (radscore + clinical variables).
summary(rad.fit$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1176 -0.3160 -0.0614 -0.0039 3.6809
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.95194 2.54104 -1.949 0.05132 .
## Age1 0.81823 0.99107 0.826 0.40903
## SexMale 0.01206 1.04495 0.012 0.99079
## GroupB 3.98457 1.64768 2.418 0.01559 *
## cT4 0.85291 1.79913 0.474 0.63545
## cN1 -8.46698 2399.54616 -0.004 0.99718
## cTNM3 10.13855 2399.54576 0.004 0.99663
## MRF1 -1.17660 1.56353 -0.753 0.45173
## `Tumor length`1 -1.79580 1.27591 -1.407 0.15929
## `Tumor thickness` 0.12757 0.08174 1.561 0.11856
## Distance1 -0.33524 0.99333 -0.337 0.73575
## CEA1 1.58655 1.04535 1.518 0.12909
## Differentiation1 -0.98389 0.92295 -1.066 0.28642
## radscore 1.27366 0.33449 3.808 0.00014 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 110.908 on 118 degrees of freedom
## Residual deviance: 43.837 on 105 degrees of freedom
## AIC: 71.837
##
## Number of Fisher Scoring iterations: 15
# Coefficients of the reduced logistic model (radscore + group only).
summary(rad_and_group.fit$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4319 -0.3875 -0.1122 -0.0136 3.5259
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.5485 0.9728 -1.592 0.1114
## GroupB 2.0499 1.0634 1.928 0.0539 .
## radscore 1.0916 0.2495 4.374 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 110.908 on 118 degrees of freedom
## Residual deviance: 53.267 on 116 degrees of freedom
## AIC: 59.267
##
## Number of Fisher Scoring iterations: 7
Sensitivity, Specificity, PPV, NPV
# Validation-set confusion matrix: radscore + clinical variables model.
# NOTE(review): factor() without explicit levels would drop a class if the
# predictions were all one label — consider levels = c("N","Y") as done above.
caret::confusionMatrix(factor(rad.fit.pred.test),reference = data.pred$PCR[-rowTrain],
positive = "Y")
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 43 5
## Y 5 5
##
## Accuracy : 0.8276
## 95% CI : (0.7057, 0.9141)
## No Information Rate : 0.8276
## P-Value [Acc > NIR] : 0.5834
##
## Kappa : 0.3958
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.50000
## Specificity : 0.89583
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.89583
## Prevalence : 0.17241
## Detection Rate : 0.08621
## Detection Prevalence : 0.17241
## Balanced Accuracy : 0.69792
##
## 'Positive' Class : Y
##
# Validation-set confusion matrix: radscore + group model.
caret::confusionMatrix(factor(rad_group.fit.pred.test),reference = data.pred$PCR[-rowTrain],
positive = "Y")
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 46 5
## Y 2 5
##
## Accuracy : 0.8793
## 95% CI : (0.767, 0.9501)
## No Information Rate : 0.8276
## P-Value [Acc > NIR] : 0.1949
##
## Kappa : 0.5201
##
## Mcnemar's Test P-Value : 0.4497
##
## Sensitivity : 0.50000
## Specificity : 0.95833
## Pos Pred Value : 0.71429
## Neg Pred Value : 0.90196
## Prevalence : 0.17241
## Detection Rate : 0.08621
## Detection Prevalence : 0.12069
## Balanced Accuracy : 0.72917
##
## 'Positive' Class : Y
##
# Validation-set confusion matrix: random forest.
caret::confusionMatrix(factor(rf.fit.pred.test),reference = data.pred$PCR[-rowTrain],
positive = "Y")
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 47 5
## Y 1 5
##
## Accuracy : 0.8966
## 95% CI : (0.7883, 0.9611)
## No Information Rate : 0.8276
## P-Value [Acc > NIR] : 0.1073
##
## Kappa : 0.5693
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.50000
## Specificity : 0.97917
## Pos Pred Value : 0.83333
## Neg Pred Value : 0.90385
## Prevalence : 0.17241
## Detection Rate : 0.08621
## Detection Prevalence : 0.10345
## Balanced Accuracy : 0.73958
##
## 'Positive' Class : Y
##
# Validation-set confusion matrix: gradient boosting model.
caret::confusionMatrix(factor(gbmB.fit.pred.test),reference = data.pred$PCR[-rowTrain],
positive = "Y")
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 46 4
## Y 2 6
##
## Accuracy : 0.8966
## 95% CI : (0.7883, 0.9611)
## No Information Rate : 0.8276
## P-Value [Acc > NIR] : 0.1073
##
## Kappa : 0.6063
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.6000
## Specificity : 0.9583
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.9200
## Prevalence : 0.1724
## Detection Rate : 0.1034
## Detection Prevalence : 0.1379
## Balanced Accuracy : 0.7792
##
## 'Positive' Class : Y
##
#http://vassarstats.net/clin1.html#return
# Training-set (primary cohort) confusion matrix: radscore + clinical model.
caret::confusionMatrix(factor(rad.fit.pred.train),reference = data.pred$PCR[rowTrain],
positive = "Y")
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 95 5
## Y 3 16
##
## Accuracy : 0.9328
## 95% CI : (0.8718, 0.9705)
## No Information Rate : 0.8235
## P-Value [Acc > NIR] : 0.0004702
##
## Kappa : 0.7597
##
## Mcnemar's Test P-Value : 0.7236736
##
## Sensitivity : 0.7619
## Specificity : 0.9694
## Pos Pred Value : 0.8421
## Neg Pred Value : 0.9500
## Prevalence : 0.1765
## Detection Rate : 0.1345
## Detection Prevalence : 0.1597
## Balanced Accuracy : 0.8656
##
## 'Positive' Class : Y
##
# Training-set confusion matrix: radscore + group model.
caret::confusionMatrix(factor(rad_group.fit.pred.train),reference = data.pred$PCR[rowTrain],
positive = "Y")
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 96 6
## Y 2 15
##
## Accuracy : 0.9328
## 95% CI : (0.8718, 0.9705)
## No Information Rate : 0.8235
## P-Value [Acc > NIR] : 0.0004702
##
## Kappa : 0.75
##
## Mcnemar's Test P-Value : 0.2888444
##
## Sensitivity : 0.7143
## Specificity : 0.9796
## Pos Pred Value : 0.8824
## Neg Pred Value : 0.9412
## Prevalence : 0.1765
## Detection Rate : 0.1261
## Detection Prevalence : 0.1429
## Balanced Accuracy : 0.8469
##
## 'Positive' Class : Y
##
# Training-set confusion matrix: random forest.
caret::confusionMatrix(factor(rf.fit.pred.train),reference = data.pred$PCR[rowTrain],
positive = "Y")
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 98 7
## Y 0 14
##
## Accuracy : 0.9412
## 95% CI : (0.8826, 0.976)
## No Information Rate : 0.8235
## P-Value [Acc > NIR] : 0.0001479
##
## Kappa : 0.7671
##
## Mcnemar's Test P-Value : 0.0233422
##
## Sensitivity : 0.6667
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9333
## Prevalence : 0.1765
## Detection Rate : 0.1176
## Detection Prevalence : 0.1176
## Balanced Accuracy : 0.8333
##
## 'Positive' Class : Y
##
# Training-set confusion matrix: gradient boosting model.
caret::confusionMatrix(factor(gbmB.fit.pred.train),reference = data.pred$PCR[rowTrain],
positive = "Y")
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 96 6
## Y 2 15
##
## Accuracy : 0.9328
## 95% CI : (0.8718, 0.9705)
## No Information Rate : 0.8235
## P-Value [Acc > NIR] : 0.0004702
##
## Kappa : 0.75
##
## Mcnemar's Test P-Value : 0.2888444
##
## Sensitivity : 0.7143
## Specificity : 0.9796
## Pos Pred Value : 0.8824
## Neg Pred Value : 0.9412
## Prevalence : 0.1765
## Detection Rate : 0.1261
## Detection Prevalence : 0.1429
## Balanced Accuracy : 0.8469
##
## 'Positive' Class : Y
##
univariate logistic for table 2
# Univariate logistic regression (one predictor at a time) for Table 2:
# odds ratio, 95% profile CI and Wald p-value per clinical variable + radscore.
# NOTE(review): `var` is not used inside this loop — possibly used further down.
var = names(clinical)[-c(1,14)]
# Columns 2:13 are the clinical covariates; the last column is the radscore.
# Assumes PCR is column 14 of data.pred — TODO confirm.
for (i in c(2:13,ncol(data.pred))) {
dat = data.pred[,c(i,14)]
#print(names(dat)[1])
res = glm(PCR ~ ., data = dat, family = binomial(link = "logit"))
print("Point.est")
# Odds ratio = exponentiated coefficient of the single predictor (row 2).
print(exp(coef(res)[2]))
print(exp(confint(res)[2,]))
print("P.val")
print(coef(summary(res))[2,4])
}
## [1] "Point.est"
## Age1
## 1.158511
## 2.5 % 97.5 %
## 0.499636 2.577730
## [1] "P.val"
## [1] 0.7229797
## [1] "Point.est"
## SexMale
## 0.9969697
## 2.5 % 97.5 %
## 0.4435814 2.3652537
## [1] "P.val"
## [1] 0.9942765
## [1] "Point.est"
## GroupB
## 1.797386
## 2.5 % 97.5 %
## 0.7284238 5.1105784
## [1] "P.val"
## [1] 0.2305823
## [1] "Point.est"
## cT4
## 1.22
## 2.5 % 97.5 %
## 0.4178221 3.1395735
## [1] "P.val"
## [1] 0.6946095
## [1] "Point.est"
## cN1
## 2.049107
## 2.5 % 97.5 %
## 0.736501 7.289028
## [1] "P.val"
## [1] 0.2084891
## [1] "Point.est"
## cTNM3
## 1.701818
## 2.5 % 97.5 %
## 0.652946 5.318086
## [1] "P.val"
## [1] 0.3109151
## [1] "Point.est"
## MRF1
## 1.219512
## 2.5 % 97.5 %
## 0.5115039 2.7593941
## [1] "P.val"
## [1] 0.6413772
## [1] "Point.est"
## `Tumor length`1
## 0.6375
## 2.5 % 97.5 %
## 0.2784818 1.5329310
## [1] "P.val"
## [1] 0.2964886
## [1] "Point.est"
## `Tumor thickness`
## 1.035864
## 2.5 % 97.5 %
## 0.9803204 1.0907868
## [1] "P.val"
## [1] 0.1843168
## [1] "Point.est"
## Distance1
## 0.6446886
## 2.5 % 97.5 %
## 0.2943432 1.4160067
## [1] "P.val"
## [1] 0.2699332
## [1] "Point.est"
## CEA1
## 1.088776
## 2.5 % 97.5 %
## 0.4701839 2.4182741
## [1] "P.val"
## [1] 0.8373333
## [1] "Point.est"
## Differentiation1
## 1.419069
## 2.5 % 97.5 %
## 0.6440263 3.2673379
## [1] "P.val"
## [1] 0.3941766
## [1] "Point.est"
## radscore
## 3.052979
## 2.5 % 97.5 %
## 2.139603 4.840260
## [1] "P.val"
## [1] 6.061884e-08
multi-variate